Welcome!
To get these notes and the datasets we’ll be using, you’ll need to clone or download this repository: https://github.com/dsidavis/RGraphicsWorkshop
You’ll also need the ggplot2 library.
library(ggplot2)
## NB Some other libraries are loaded below. These are all optional. It's good practice to put your dependencies at the top of your code.
data_dir = 'data/'
plots_dir = 'plots/'
plot(some_object) often does something useful
bgpchcexggplot2 is an implementation of the grammar of graphics in R developed by Hadley Wickham
ggplot2
ggplot2
## Plot 0 ----
library(datasauRus)
## This is a fun collection of datasets: They look totally different when they're plotted, but have the same mean, standard deviation, and Pearson correlation coefficient.
str(datasaurus_dozen_wide, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 142 obs. of 26 variables:
## $ away_x : num 32.3 53.4 63.9 70.3 34.1 ...
## $ away_y : num 61.4 26.2 30.8 82.5 45.7 ...
## $ bullseye_x : num 51.2 59 51.9 48.2 41.7 ...
## $ bullseye_y : num 83.3 85.5 85.8 85 84 ...
## $ circle_x : num 56 50 51.3 51.2 44.4 ...
## $ circle_y : num 79.3 79 82.4 79.2 78.2 ...
## $ dino_x : num 55.4 51.5 46.2 42.8 40.8 ...
## $ dino_y : num 97.2 96 94.5 91.4 88.3 ...
## $ dots_x : num 51.1 50.5 50.2 50.1 50.6 ...
## $ dots_y : num 90.9 89.1 85.5 83.1 82.9 ...
## $ h_lines_x : num 53.4 52.8 47.1 42.4 42.7 ...
## $ h_lines_y : num 90.2 90.1 90.5 89.5 90.4 ...
## $ high_lines_x: num 57.6 51.3 50.8 37 42.9 ...
## $ high_lines_y: num 83.9 82.8 76.8 82 80.2 ...
## $ slant_down_x: num 52.9 59 56.4 37.8 39.9 ...
## $ slant_down_y: num 97.3 93.6 96.3 94.4 90.6 ...
## $ slant_up_x : num 47.7 44.6 43.9 41.6 49.2 ...
## $ slant_up_y : num 95.2 93.1 94.1 90.3 96.6 ...
## $ star_x : num 58.2 58.2 58.7 57.3 58.1 ...
## $ star_y : num 91.9 92.2 90.3 89.9 92 ...
## $ v_lines_x : num 50.5 50.3 50.2 50.3 50.5 ...
## $ v_lines_y : num 93.2 97.6 99.7 90 90 ...
## $ wide_lines_x: num 65.8 65.7 39 37.8 35.5 ...
## $ wide_lines_y: num 95.6 91.9 92.3 93.5 89.6 ...
## $ x_shape_x : num 38.3 35.8 32.8 33.7 37.2 ...
## $ x_shape_y : num 92.5 94.1 88.5 88.6 83.7 ...
str(datasaurus_dozen, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1846 obs. of 3 variables:
## $ dataset: chr "dino" "dino" "dino" "dino" ...
## $ x : num 55.4 51.5 46.2 42.8 40.8 ...
## $ y : num 97.2 96 94.5 91.4 88.3 ...
ggplot(datasaurus_dozen_wide,
aes(dino_x, dino_y)) +
geom_point()
ggplot(datasaurus_dozen,
aes(x, y)) +
geom_point() +
facet_wrap(~ dataset)
## Plot 1 ----
## Derived from figure 2: <https://www.frontiersin.org/articles/10.3389/frma.2018.00024/full>
counts_df = readRDS(paste0(data_dir, 'paper_counts.Rds'))
counts_df
## # A tibble: 27 x 3
## subject_area full sample
## <fct> <dbl> <dbl>
## 1 MEDI 0.159 0.114
## 2 BIOC 0.127 0.0851
## 3 AGRI 0.131 0.0723
## 4 ENGI 0.0582 0.0620
## 5 PHYS 0.0632 0.0574
## 6 CHEM 0.0401 0.0531
## 7 ENVI 0.0529 0.0498
## 8 MATE 0.0384 0.0483
## 9 SOCI 0.0344 0.0446
## 10 COMP 0.0324 0.0380
## # ... with 17 more rows
ggplot(counts_df, aes(subject_area)) +
## geoms
geom_point(aes(y = full, color = 'Full'), size = 3) +
geom_point(aes(y = sample, color = 'Sample'), size = 3) +
geom_segment(aes(x = subject_area, xend = subject_area,
y = full, yend = sample),
arrow = arrow(angle = 15, type = 'closed',
length = unit(.025, 'snpc'))) +
## scales
xlab('ASJC subject area') +
scale_y_continuous(labels = scales::percent_format(),
name = 'Papers in dataset') +
scale_color_brewer(palette = 'Set1',
name = 'Dataset') +
coord_flip() +
## annotations and theme
ggtitle('Sampling improves balance across subject areas',
subtitle = Sys.time()) +
annotate(geom = 'label', x = 'HEAL', y = .12, label = Sys.time()) +
theme_minimal() +
theme(legend.position = c(.8, .2),
legend.background = element_rect(fill = 'white'))
ggsave(paste0(plots_dir, 'plot_1.png'), height = 7, width = 7)
## Plot 2 ----
## Code to construct this dataset is in `climate_data.R`
temp_df = readRDS(paste0(data_dir, 'temp.Rds'))
str(temp_df)
## 'data.frame': 14577 obs. of 12 variables:
## $ id : chr "USC00042294" "USC00042294" "USC00042294" "USC00042294" ...
## $ name : chr "Davis" "Davis" "Davis" "Davis" ...
## $ date : Date, format: "2009-01-01" "2009-01-02" ...
## $ year : num 2009 2009 2009 2009 2009 ...
## $ month : chr "January" "January" "January" "January" ...
## $ day : num 1 2 3 4 5 6 7 8 9 10 ...
## $ temp_min : num 3.3 3.9 0.6 -2.8 -2.2 1.7 0 3.9 0.6 -2.8 ...
## $ temp_max : num 6.1 6.7 10.6 11.1 10 7.2 12.8 7.2 10 16.7 ...
## $ temp_delta : num 2.8 2.8 10 13.9 12.2 5.5 12.8 3.3 9.4 19.5 ...
## $ precipitation: num 0 0 0 0 5 8 0 0 0 0 ...
## $ lat : num 38.5 38.5 38.5 38.5 38.5 ...
## $ lon : num -122 -122 -122 -122 -122 ...
ggplot(temp_df, aes(temp_max, temp_delta)) +
## geoms
# geom_point() +
# geom_count() +
# geom_bin2d() +
# geom_hex(aes(color = ..count..)) +
# geom_density2d(size = 1) +
# stat_density2d(contour = FALSE, geom = 'raster', aes(fill = ..density..), show.legend = FALSE) +
stat_density2d(geom = 'polygon', aes(fill = ..level..), show.legend = FALSE) +
## facets
facet_wrap(~ name) +
## Annotations are repeated within each facet
# annotate(geom = 'label', x = 200, y = 5, label = Sys.time()) +
## scales
xlab('Daily maximum temperature (ºC)') +
ylab('Daily temperature difference (ºC)') +
scale_color_viridis_c(option = 'A') +
scale_fill_viridis_c(option = 'A') +
## themes
theme_bw() +
ggtitle('Temperature difference is associated with maximum daily temperature',
subtitle = Sys.time())
## Warning: Removed 104 rows containing non-finite values (stat_density2d).
ggsave(paste0(plots_dir, 'plot_2.png'), height = 7, width = 7)
## Warning: Removed 104 rows containing non-finite values (stat_density2d).
## Plot 3 ----
## Hack to get month labels on day-as-year
## <https://stackoverflow.com/a/39861306/3187973>
temp_df$day.date = as.Date(temp_df$day, '2009-01-01')
## Model **for Sacramento alone**
model = lm(data = temp_df, temp_delta ~ day + I(day^2),
subset = name == 'Sacramento')
predictions = predict(model, interval = 'confidence', level = .95)
predictions = as.data.frame(predictions)
predictions$day = model$model$day
predictions$day.date = as.Date(predictions$day, '2009-01-01')
ggplot(temp_df, aes(day.date, temp_delta)) +
## geoms
geom_path(aes(color = month, group = year), alpha = .3, show.legend = FALSE) +
## Separate models for each panel
geom_smooth(method = lm, formula = y ~ x + I(x^2),
color = 'red', size = 1) +
## Sacramento-only model
## Use the smooth geom to get curve + ribbon
## But set `stat = 'identity'` because we don't want to fit a (new) model
geom_smooth(data = predictions, stat = 'identity',
aes(y = fit, ymax = upr, ymin = lwr),
color = 'blue', fill = 'blue', size = 1) +
## facets
facet_wrap(~ name) +
## scales
scale_x_date(name = 'Month', date_breaks = '3 month', date_labels = '%b') +
scale_y_continuous(name = 'Daily temperature difference (max - min)',
labels = scales::math_format(expr = .x*degree*C)) +
scale_color_viridis_d(limits = month.name,
option = 'E',
direction = -1) +
# coord_polar() +
## themes
theme_bw() +
ggtitle('Sacramento has smaller temperature differences than other sites',
subtitle = Sys.time())
## Warning: Removed 104 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_path).
## Note effect of aspect ratios on perception
ggsave(paste0(plots_dir, 'plot_3.png'), height = 7, width = 7)
## Warning: Removed 104 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_path).
ggsave(paste0(plots_dir, 'plot_3a.png'), height = 7, width = 12)
## Warning: Removed 104 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_path).
## Plot 4 ----
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
ggplot2 (version >=3) can plot vector spatial data from sf objects using geom_sf().
if (packageVersion('ggplot2') < package_version('3.0.0')) {
stop('ggplot2 version 3 or later is required for this section')
}
This section looks at two different ways of adding basemaps under geom_sf() plots
stations = readRDS(paste0(data_dir, 'stations.Rds'))
stations.utm = readRDS(paste0(data_dir, 'stations.utm.Rds'))
stations
## Simple feature collection with 4 features and 4 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -122.1241 ymin: 38.4916 xmax: -119.9947 ymax: 38.8983
## epsg (SRID): 4326
## proj4string: +proj=longlat +ellps=WGS84 +no_defs
## # A tibble: 4 x 5
## name ghcnd lat lon geometry
## <chr> <chr> <dbl> <dbl> <POINT [°]>
## 1 Davis USC00042294 38.5 -122. (-121.7761 38.5349)
## 2 Sacramento USW00023271 38.6 -121. (-121.4183 38.5552)
## 3 Lake Tahoe USW00093230 38.9 -120. (-119.9947 38.8983)
## 4 Lake Berryessa USC00045360 38.5 -122. (-122.1241 38.4916)
ggmap approachlibrary(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## - Services other than Google Maps are deprecated
## - Requires Google account, API key
## - Doesn't seem to play nicely with coordinates other than lat/lon
## See ?register_google for links and instructions
## ***DO NOT SHARE YOUR API KEY***
# register_google(key = 'notarealkey', write = TRUE)
if (!has_google_key()) {
stop('You need a Google Maps API key. See ?register_google')
}
basemap = get_map(location = c(-121.4183, 38.5552),
maptype = 'terrain',
zoom = 8)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=38.5552,-121.4183&zoom=8&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx
ggmap(basemap) +
geom_sf(data = stations, inherit.aes = FALSE) +
geom_sf_label(data = stations,
aes(label = name), nudge_y = -.1, size = 3)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
ggsave(paste0(plots_dir, 'plot_4_ggmap.png'), height = 7, width = 7)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
## Example with UTM coordinates
# ggmap(basemap) +
# geom_sf(data = stations.utm, inherit.aes = FALSE)
ggspatial approachlibrary(ggspatial)
## - annotation_map_tile() is slow, even with cached images
## See rosm::osm.types() for a list of basemap types
ggplot(data = stations) +
annotation_map_tile(type = 'osm', zoom = 10) +
geom_sf_label(data = stations,
aes(label = name), nudge_y = .03, size = 3) +
geom_sf() +
annotation_scale(location = 'tl') +
annotation_north_arrow(location = 'br', which_north = 'true',
style = north_arrow_minimal)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
## Loading required namespace: raster
## Zoom: 10
ggsave(paste0(plots_dir, 'plot_4_ggspatial.png'), height = 7, width = 7)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may
## not give correct results for longitude/latitude data
## Zoom: 10
## Example with UTM coordinates
# ggplot(data = stations.utm) +
# annotation_map_tile(type = 'osm', zoom = 10,
# cachedir = system.file('rosm.cache', package = 'ggspatial')) +
# geom_sf_label(data = stations,
# aes(label = name), nudge_y = 1000, size = 3) +
# geom_sf()
Strictly speaking, base is the name of a particular package; “base graphics” aren’t included in this package, but instead are in the graphics package. So “base graphics” is strictly incorrect. But “graphics graphics” sounds silly, and the graphics package is loaded automatically when you launch R, so everyone just calls it “base graphics.”↩